import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()  # embed plotly.js so figures render inline in this notebook
Credits: based on https://www.kaggle.com/crawford/principle-component-analysis-gene-expression/notebook
https://www.kaggle.com/crawford/principle-component-analysis-gene-expression/
Datos usados para clasificar pacientes con acute myeloid leukemia (AML) and acute lymphoblastic leukemia (ALL).
Golub et al "Molecular Classification of Cancer: Class Discovery and Class Prediction by Gene Expression Monitoring"
There are two datasets containing the initial (training, 38 samples) and independent (test, 34 samples) datasets used in the paper. These datasets contain measurements corresponding to ALL and AML samples from Bone Marrow and Peripheral Blood. Intensity values have been re-scaled such that overall intensities for each chip are equivalent.
# File names: training set, independent test set, and patient labels.
testfile = 'data_set_ALL_AML_independent.csv'
trainfile = 'data_set_ALL_AML_train.csv'
labels = 'genes.actual.csv'

# Load the expression matrices and the class labels.
X_train = pd.read_csv(trainfile)
X_test = pd.read_csv(testfile)
y = pd.read_csv(labels)

# 1) Drop the per-sample "call" quality columns from train and test.
X_train = X_train.drop(columns=[c for c in X_train.columns if "call" in c])
X_test = X_test.drop(columns=[c for c in X_test.columns if "call" in c])

# 2) Transpose so rows become samples and columns become genes.
X_train = X_train.T
X_test = X_test.T
X_train.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 7119 | 7120 | 7121 | 7122 | 7123 | 7124 | 7125 | 7126 | 7127 | 7128 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gene Description | AFFX-BioB-5_at (endogenous control) | AFFX-BioB-M_at (endogenous control) | AFFX-BioB-3_at (endogenous control) | AFFX-BioC-5_at (endogenous control) | AFFX-BioC-3_at (endogenous control) | AFFX-BioDn-5_at (endogenous control) | AFFX-BioDn-3_at (endogenous control) | AFFX-CreX-5_at (endogenous control) | AFFX-CreX-3_at (endogenous control) | AFFX-BioB-5_st (endogenous control) | ... | Transcription factor Stat5b (stat5b) mRNA | Breast epithelial antigen BA46 mRNA | GB DEF = Calcium/calmodulin-dependent protein ... | TUBULIN ALPHA-4 CHAIN | CYP4B1 Cytochrome P450; subfamily IVB; polypep... | PTGER3 Prostaglandin E receptor 3 (subtype EP3... | HMG2 High-mobility group (nonhistone chromosom... | RB1 Retinoblastoma 1 (including osteosarcoma) | GB DEF = Glycophorin Sta (type A) exons 3 and ... | GB DEF = mRNA (clone 1A7) |
| Gene Accession Number | AFFX-BioB-5_at | AFFX-BioB-M_at | AFFX-BioB-3_at | AFFX-BioC-5_at | AFFX-BioC-3_at | AFFX-BioDn-5_at | AFFX-BioDn-3_at | AFFX-CreX-5_at | AFFX-CreX-3_at | AFFX-BioB-5_st | ... | U48730_at | U58516_at | U73738_at | X06956_at | X16699_at | X83863_at | Z17240_at | L49218_f_at | M71243_f_at | Z78285_f_at |
| 1 | -214 | -153 | -58 | 88 | -295 | -558 | 199 | -176 | 252 | 206 | ... | 185 | 511 | -125 | 389 | -37 | 793 | 329 | 36 | 191 | -37 |
| 2 | -139 | -73 | -1 | 283 | -264 | -400 | -330 | -168 | 101 | 74 | ... | 169 | 837 | -36 | 442 | -17 | 782 | 295 | 11 | 76 | -14 |
| 3 | -76 | -49 | -307 | 309 | -376 | -650 | 33 | -367 | 206 | -215 | ... | 315 | 1199 | 33 | 168 | 52 | 1138 | 777 | 41 | 228 | -41 |
5 rows × 7129 columns
# 3) Promote the "Gene Accession Number" row to column headers for the
#    training data, drop the two metadata rows, and coerce to numeric.
X_train.columns = X_train.loc["Gene Accession Number"]
X_train = X_train.drop(["Gene Description", "Gene Accession Number"]).apply(pd.to_numeric)

# Same clean-up for the test data.
X_test.columns = X_test.loc["Gene Accession Number"]
X_test = X_test.drop(["Gene Description", "Gene Accession Number"]).apply(pd.to_numeric)
X_train.head()
| Gene Accession Number | AFFX-BioB-5_at | AFFX-BioB-M_at | AFFX-BioB-3_at | AFFX-BioC-5_at | AFFX-BioC-3_at | AFFX-BioDn-5_at | AFFX-BioDn-3_at | AFFX-CreX-5_at | AFFX-CreX-3_at | AFFX-BioB-5_st | ... | U48730_at | U58516_at | U73738_at | X06956_at | X16699_at | X83863_at | Z17240_at | L49218_f_at | M71243_f_at | Z78285_f_at |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | -214 | -153 | -58 | 88 | -295 | -558 | 199 | -176 | 252 | 206 | ... | 185 | 511 | -125 | 389 | -37 | 793 | 329 | 36 | 191 | -37 |
| 2 | -139 | -73 | -1 | 283 | -264 | -400 | -330 | -168 | 101 | 74 | ... | 169 | 837 | -36 | 442 | -17 | 782 | 295 | 11 | 76 | -14 |
| 3 | -76 | -49 | -307 | 309 | -376 | -650 | 33 | -367 | 206 | -215 | ... | 315 | 1199 | 33 | 168 | 52 | 1138 | 777 | 41 | 228 | -41 |
| 4 | -135 | -114 | 265 | 12 | -419 | -585 | 158 | -253 | 49 | 31 | ... | 240 | 835 | 218 | 174 | -110 | 627 | 170 | -50 | 126 | -91 |
| 5 | -106 | -125 | -76 | 168 | -230 | -284 | 4 | -122 | 70 | 252 | ... | 156 | 649 | 57 | 504 | -26 | 250 | 314 | 14 | 56 | -25 |
5 rows × 7129 columns
# 4) Split the labels to match the paper: patients 1-38 are the
#    training set, the remaining patients form the test set.
X_train = X_train.reset_index(drop=True)
y_train = y.loc[y["patient"] <= 38].reset_index(drop=True)

X_test = X_test.reset_index(drop=True)
y_test = y.loc[y["patient"] > 38].reset_index(drop=True)
Realiza un análisis exploratorio de los datos (correlaciones entre sí y con las clases, distribuciones,...). Usa las técnicas y gráficos que te parezcan más representativos.
Graficamos el diagrama de correlación entre los atributos.
def correlation_matrix(df):
    """Render the feature-feature correlation matrix of *df* as a heatmap.

    Parameters: df — DataFrame whose column correlations are plotted.
    Side effect: shows a matplotlib figure.
    """
    from matplotlib import pyplot as plt

    fig = plt.figure(figsize=(16, 12))
    ax1 = fig.add_subplot(111)
    # 30-level discrete colormap; plt.get_cmap replaces the removed
    # matplotlib.cm.get_cmap API.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(df.corr(), interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    # Title fixed: this notebook analyses gene-expression data, not the
    # wine data set the snippet was copied from.
    plt.title('Gene expression features correlation\n', fontsize=15)
    labels = df.columns
    # Set explicit tick positions before assigning labels; otherwise
    # matplotlib emits "FixedFormatter should only be used together
    # with FixedLocator" (the warning captured in the original run).
    ticks = np.arange(len(labels))
    ax1.set_xticks(ticks)
    ax1.set_yticks(ticks)
    ax1.set_xticklabels(labels, fontsize=9)
    ax1.set_yticklabels(labels, fontsize=9)
    # Colorbar ticks spanning the full [-1, 1] correlation range.
    fig.colorbar(cax, ticks=[0.1 * i for i in range(-11, 11)])
    plt.show()

df = pd.DataFrame(X_train)
correlation_matrix(df)
C:\Users\adria\AppData\Local\Temp/ipykernel_15720/3989663827.py:12: UserWarning: FixedFormatter should only be used together with FixedLocator C:\Users\adria\AppData\Local\Temp/ipykernel_15720/3989663827.py:13: UserWarning: FixedFormatter should only be used together with FixedLocator
Vemos cómo principalmente predomina el color verde, indicando una correlación 0 entre la mayoría de atributos. Aun así podemos ver suficientes tonos azulados o rojizos, indicándonos que hay varios atributos muy correlacionados.
The analysis reveals that 21 principal components are needed to account for 80% of the variance. PC 1-3 add up to about ~33% and the rest is a slow burn where each component after PC8 contributes between 1-2% of the variance up until PC38 which is essentially zero. 1% is a decent amount of variance and so the number of important PCs is up for interpretation.
# 5) Scale data
# (1) Standardize each gene to zero mean / unit variance.
# The scaler is fit on the TRAINING set only and then applied to the
# test set.  The original code called fit_transform on the test set as
# well, which standardizes it with its own statistics — the two sets
# become incomparable and test-set information leaks into the pipeline.
scaler = StandardScaler()
X_train_scl = scaler.fit_transform(X_train)
X_test_scl = scaler.transform(X_test)
X_train[:3]
| Gene Accession Number | AFFX-BioB-5_at | AFFX-BioB-M_at | AFFX-BioB-3_at | AFFX-BioC-5_at | AFFX-BioC-3_at | AFFX-BioDn-5_at | AFFX-BioDn-3_at | AFFX-CreX-5_at | AFFX-CreX-3_at | AFFX-BioB-5_st | ... | U48730_at | U58516_at | U73738_at | X06956_at | X16699_at | X83863_at | Z17240_at | L49218_f_at | M71243_f_at | Z78285_f_at |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -214 | -153 | -58 | 88 | -295 | -558 | 199 | -176 | 252 | 206 | ... | 185 | 511 | -125 | 389 | -37 | 793 | 329 | 36 | 191 | -37 |
| 1 | -139 | -73 | -1 | 283 | -264 | -400 | -330 | -168 | 101 | 74 | ... | 169 | 837 | -36 | 442 | -17 | 782 | 295 | 11 | 76 | -14 |
| 2 | -76 | -49 | -307 | 309 | -376 | -650 | 33 | -367 | 206 | -215 | ... | 315 | 1199 | 33 | 168 | 52 | 1138 | 777 | 41 | 228 | -41 |
3 rows × 7129 columns
# 6) PCA analysis and projection
components = 21

# (2) Fit PCA with `components` principal components on the scaled
# training data; `Y` holds the fitted estimator for reuse below.
pca = PCA(n_components=components)
Y = pca.fit(X_train_scl)

# (3) Per-component explained variance ratio and its running total.
var_exp = Y.explained_variance_ratio_
cum_var_exp = var_exp.cumsum()
print(var_exp)
print(cum_var_exp)
[0.14987793 0.11977811 0.06600567 0.04884913 0.04632409 0.03721951 0.03490898 0.03289557 0.02985114 0.02644557 0.02509463 0.02357098 0.02203596 0.02084089 0.01940031 0.01892476 0.01845337 0.01712219 0.01703644 0.01640926 0.01532619] [0.14987793 0.26965604 0.33566171 0.38451084 0.43083493 0.46805444 0.50296342 0.53585899 0.56571013 0.5921557 0.61725033 0.64082131 0.66285728 0.68369817 0.70309848 0.72202324 0.74047661 0.7575988 0.77463523 0.79104449 0.80637069]
Pregunta (1): ¿Qué pauta puede observarse en los valores de var_exp? ¿Cuál es la interpretación relativa de esos valores?
Podemos observar la distinta varianza que aporta cada variable. La interpretación relativa nos explica el total de varianza según se van añadiendo variables al modelo
# Plot the explained variance using var_exp and cum_var_exp.
# range(1, components + 1) labels all 21 components; the original
# range(1, components) produced only 20 labels for 21 values, so the
# last bar/point was left unlabeled.
x = ["PC%s" % i for i in range(1, components + 1)]
trace1 = go.Bar(
    x=x,
    y=list(var_exp),
    name="Explained Variance")
trace2 = go.Scatter(
    x=x,
    y=cum_var_exp,
    name="Cumulative Variance")
layout = go.Layout(
    title='Explained variance',
    xaxis=dict(title='Principle Components', tickmode='linear'))
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)
The first three components only explain 33% of the variance but we'll go ahead plot the projection to get a visual of it.
# Project onto the first three principal components.
# The PCA estimator `Y` was already fit on X_train_scl above, so a
# plain transform() is enough — fit_transform would redo the same fit.
Y_train_pca = Y.transform(X_train_scl)

traces = []
for name in ['ALL', 'AML']:
    trace = go.Scatter3d(
        x=Y_train_pca[y_train.cancer == name, 0],
        y=Y_train_pca[y_train.cancer == name, 1],
        z=Y_train_pca[y_train.cancer == name, 2],
        mode='markers',
        name=name,
        # Plain dicts replace the deprecated go.Marker / go.Line
        # wrappers that triggered the DeprecationWarnings.
        marker=dict(size=10, line=dict(width=1), opacity=1))
    traces.append(trace)

layout = go.Layout(
    xaxis=dict(title='PC1'),
    yaxis=dict(title='PC2'),
    title="Projection of First Three Principle Components"
)
data = traces
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)
/home/lilg8b/.local/lib/python3.8/site-packages/plotly/graph_objs/_deprecations.py:378: DeprecationWarning: plotly.graph_objs.Line is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.scatter.Line - plotly.graph_objs.layout.shape.Line - etc. /home/lilg8b/.local/lib/python3.8/site-packages/plotly/graph_objs/_deprecations.py:434: DeprecationWarning: plotly.graph_objs.Marker is deprecated. Please replace it with one of the following more specific types - plotly.graph_objs.scatter.Marker - plotly.graph_objs.histogram.selected.Marker - etc.
Pregunta(2): Modificando la perspectiva de la figura con el ratón, ¿qué observas en cuanto a la separabilidad de las clases? Adjunta una imagen que apoye tus conclusiones.
Aquí aportamos diferentes capturas que nos ayudarán a ver las potenciales diferencias de cada eje.
Se puede ver que la clase roja toma los valores de Y más pequeños [-40, -20], y la clase azul toma los más grandes, entre [-20, 40], incluso con outliers que toman valores de 80 o superiores a 100.
Se puede ver que la clase roja toma los valores de Z más grandes, que van de [0, 50], mientras que la clase azul toma valores de Z más pequeños, entre [-40, 0].
En cuanto al eje X, podemos observar que la clase roja, eliminando los dos valores más extremos u outliers, tiende al valor 20, estableciéndose entre el rango de valores [0, 40]. La clase azul toma todos los valores, concentrándose principalmente en [-40, 0], pero también tomando valores de [0, 60]. Principalmente, podríamos decir que para la clase azul el eje X queda dividido en el número 0, donde no hay casos, con una mayor condensación para X negativa.
Con estas conclusiones, sería posible asumir: Que si una variable, para los ejes de manera [X,Y,Z], a grandes rasgos puede ser clasificada de las siguientes maneras:
Pertenecerá a la clase roja si toma los valores:
$$[x > 0,\; y < 0,\; z > 0]$$

En cambio, pertenecerá a la clase azul si toma los valores de la forma:

$$[x \neq 0 \;\lor\; x < 0,\; y > 0,\; z < 0]$$

Realizar un análisis similar usando LDA, usando en este caso la información sobre el tipo de cáncer de cada paciente. Puedes seguir la guía en
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import LabelEncoder

# Encode the cancer type as integer labels: 1 = ALL, 2 = AML.
enc = LabelEncoder()
label_encoder = enc.fit(y_train["cancer"])
y = label_encoder.transform(y_train["cancer"]) + 1
label_dict = dict(enumerate(['ALL', 'AML'], start=1))

# Two classes allow at most one linear discriminant.
sklearn_lda = LDA(n_components=1)
X_lda_sklearn = sklearn_lda.fit_transform(X_train.values, y)
# plotting code (from https://www.apsl.net/blog/2017/07/18/using-linear-discriminant-analysis-lda-data-explore-step-step/, you may need to adapt it)
from matplotlib import pyplot as plt
import numpy as np
def plot_step_lda():
    """Scatter the 1-D LDA projection along LD1, one color/marker per class.

    Reads the module-level X_lda_sklearn, y and label_dict.
    Side effect: shows a matplotlib figure.
    """
    ax = plt.subplot(111)
    # Only two classes exist (1 = ALL, 2 = AML).  The original looped
    # over range(1, 4), which raised KeyError on label_dict[3].
    for label, marker, color in zip(
            range(1, 3), ('^', 's'), ('blue', 'red')):
        pts = X_lda_sklearn[:, 0].real[y == label]
        plt.scatter(x=pts,
                    y=np.zeros(len(pts)),  # 1-D projection: plot along y = 0
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label]
                    )

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    # Title fixed: this is the leukemia data set, not Iris.
    plt.title('LDA: leukemia projection onto the first linear discriminant')

    # Hide axis ticks; tick_params expects booleans, not 'on'/'off'.
    plt.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)

    # Remove axis spines.
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)

    plt.grid()
    plt.tight_layout()  # original referenced the function without calling it
    plt.show()
def plot_scikit_lda(X, title):
    """Plot a 1-D LDA projection.

    Parameters:
        X     — (n_samples, 1) array of discriminant scores.
        title — figure title.
    Reads the module-level y and label_dict.  Side effect: shows a figure.
    """
    ax = plt.subplot(111)
    for label, marker, color in zip(
            range(1, 3), ('^', 's'), ('blue', 'red')):
        # Use the X parameter consistently (the original mixed it with
        # the global X_lda_sklearn) and drop the stray debug print().
        pts = X[:, 0][y == label]
        plt.scatter(x=pts,
                    y=np.zeros(len(pts)),  # 1-D projection: plot along y = 0
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title(title)

    # Hide axis ticks; tick_params expects booleans, not 'on'/'off'.
    plt.tick_params(axis='both', which='both', bottom=False, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)

    # Remove axis spines.
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    plt.grid()
    plt.tight_layout()  # the original forgot the parentheses (a no-op)
    plt.show()

plot_scikit_lda(X_lda_sklearn, title='Default LDA via scikit-learn')
Utiliza k-means clustering con los datos originales y con los datos proyectados con PCA y LDA. ¿Qué observas?
## YOUR CODE HERE
import matplotlib.pyplot as plt

# K-means with two clusters on the original (scaled) training data.
nclusters = 2
t_train_km = KMeans(n_clusters=nclusters, random_state=0).fit_predict(X_train_scl)

# Show the cluster assignment over the first two original attributes.
plt.figure(figsize=(12, 12))
plt.subplot(223)
plt.scatter(X_train_scl[:, 0], X_train_scl[:, 1], c=t_train_km, label=t_train_km)
plt.title("kMeans datos originales")
plt.xlabel('Attrib 0')
plt.ylabel('Attrib 1')
Text(0, 0.5, 'Attrib 1')
# K-means on the PCA projection of the training data.
x_pca = Y.transform(X_train_scl)

nclusters = 2
kmeans = KMeans(n_clusters=nclusters, random_state=0)
y_train_km_pca = kmeans.fit_predict(x_pca)

plt.figure()
lw = 2
x_pca_df = pd.DataFrame(x_pca)
y_train_km_pca_df = pd.DataFrame(y_train_km_pca, columns=['cancer'])
df = pd.concat([x_pca_df, y_train_km_pca_df], axis=1)

# Plot PC1 vs PC2 for each cluster.  The original indexed ROWS
# (zero[0], zero[1]) instead of columns, which plotted two individual
# samples against each other rather than the first two components.
zero = df.loc[df['cancer'] == 0].to_numpy()
plt.scatter(zero[:, 0], zero[:, 1], c='orange', alpha=0.8, lw=lw, label='cluster 0')
one = df.loc[df['cancer'] == 1].to_numpy()
plt.scatter(one[:, 0], one[:, 1], c='blue', alpha=0.8, lw=lw, label='cluster 1')

# label= above gives the legend real handles; the original emitted
# "No handles with labels found to put in legend".
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA with k-Means Clustering")
plt.xlabel('PCA1')
plt.ylabel('PCA2')
No handles with labels found to put in legend.
Text(0, 0.5, 'PCA2')
# K-means on the (1-D) LDA projection.
y_lda = kmeans.fit_predict(X_lda_sklearn)

plt.figure()
lw = 2
x_lda_df = pd.DataFrame(X_lda_sklearn)
y_train_km_lda_df = pd.DataFrame(y_lda, columns=['cancer'])
df2 = pd.concat([x_lda_df, y_train_km_lda_df], axis=1)

# Select each cluster from df2 itself — the original filtered df2 with
# the mask of `df` (the PCA frame), pairing LDA rows with the wrong
# cluster assignments — and plot only the LD1 column at y = 0.
zero = df2.loc[df2['cancer'] == 0, 0].to_numpy()
plt.scatter(zero, np.zeros_like(zero), c='orange', alpha=0.8, lw=lw, label='cluster 0')
one = df2.loc[df2['cancer'] == 1, 0].to_numpy()
plt.scatter(one, np.zeros_like(one), c='blue', alpha=0.8, lw=lw, label='cluster 1')

plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("LDA with k-Means Clustering")
plt.xlabel('LD1')
plt.ylabel('Zeros')
No handles with labels found to put in legend.
Text(0, 0.5, 'Zeros')
Observamos cómo el clúster más definido es el de LDA, a pesar de tener solo 1 dimensión. Esto es porque LDA hace clasificación sobre las clases, y luego hacemos clustering sobre esa proyección; por eso quedan tan bien definidos y separados los clústers.
En cambio en PCA, como su objetivo es la representación en menos dimensiones sin clasificación, a la hora de hacer clustering no vemos resultados mejores que con los datos originales.
Con los datos originales tampoco obtenemos unos clústers bien definidos: los datos están bastante dispersos. Por lo que podemos concluir que la mejor forma de hacer clustering es aplicar antes LDA, que gracias a su clasificación nos da 2 clústers muy definidos.